{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !rm -rf __MACOSX/news\n",
    "# !rm -rf news\n",
    "# !rm news.zip\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/news/news.zip\n",
    "# !unzip news.zip\n",
    "\n",
    "# !rm -rf __MACOSX\n",
    "# !rm news-30k.json.zip news-30k.json\n",
    "# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/news/news-30k/news-30k.json.zip\n",
    "# !unzip news-30k.json.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from unidecode import unidecode\n",
    "from malaya.text.rules import normalized_chars\n",
    "\n",
    "def filter_news(string):\n",
    "    \"\"\"\n",
    "    Return True when the lower-cased text contains any blocked marker.\n",
    "    \"\"\"\n",
    "    lowered = string.lower()\n",
    "    markers = (\n",
    "        'javascript is disabled',\n",
    "        'requires javascript',\n",
    "        'javascript',\n",
    "        'président',\n",
    "    )\n",
    "    return any(marker in lowered for marker in markers)\n",
    "\n",
    "def make_cleaning(s, c_dict):\n",
    "    \"\"\"\n",
    "    Apply the translation table `c_dict` to `s` with `str.translate`.\n",
    "    \"\"\"\n",
    "    return s.translate(c_dict)\n",
    "\n",
    "def transformer_textcleaning(string):\n",
    "    \"\"\"\n",
    "    use by any transformer model before tokenization\n",
    "\n",
    "    Steps, in order:\n",
    "    1. pass the text through `unidecode`;\n",
    "    2. translate every whitespace-separated word with the `normalized_chars`\n",
    "       table (imported from malaya; its contents are not visible here);\n",
    "    3. turn the literal '(dot)' into '.';\n",
    "    4. if the first '<a ...>' tag has 'href' in its attributes, delete those\n",
    "       attributes from the text;\n",
    "    5. blank out URLs, collapse runs of spaces, drop words starting with '@'\n",
    "       and apply `str.title` to words whose first character is uppercase.\n",
    "\n",
    "    Returns the remaining words joined by single spaces.\n",
    "    \"\"\"\n",
    "    string = unidecode(string)\n",
    "    string = ' '.join(\n",
    "        [make_cleaning(w, normalized_chars) for w in string.split()]\n",
    "    )\n",
    "    # Raw string: '\\(' in a plain literal is an invalid escape sequence.\n",
    "    string = re.sub(r'\\(dot\\)', '.', string)\n",
    "    # Locate the anchor once, with the same pattern that the href check uses,\n",
    "    # and remove its attributes as literal text. Re-using the captured text as\n",
    "    # a regex pattern fails on URLs containing metacharacters ('?', '+', '(').\n",
    "    anchor = re.search(r'\\<a (.*?)\\>', string)\n",
    "    if anchor is not None and 'href' in anchor.group(1):\n",
    "        string = string.replace(' ' + anchor.group(1), '')\n",
    "    string = re.sub(\n",
    "        r'\\w+:\\/{2}[\\d\\w-]+(\\.[\\d\\w-]+)*(?:(?:\\/[^\\s/]*))*', ' ', string\n",
    "    )\n",
    "    string = string.replace('\\n', ' ')\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip().split()\n",
    "    string = [w for w in string if w[0] != '@']\n",
    "    string = [w.title() if w[0].isupper() else w for w in string]\n",
    "    return ' '.join(string)\n",
    "\n",
    "import re\n",
    "\n",
    "def cleaning(string):\n",
    "    \"\"\"\n",
    "    Turn newlines into spaces, squeeze runs of spaces into one and strip\n",
    "    surrounding whitespace.\n",
    "    \"\"\"\n",
    "    flattened = string.replace('\\n', ' ')\n",
    "    return re.sub(r'[ ]+', ' ', flattened).strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 860/860 [00:16<00:00, 51.68it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(120410, 120410)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "from tqdm import tqdm\n",
    "import json\n",
    "\n",
    "def _collect_pairs(records, keep_language, before, after):\n",
    "    \"\"\"\n",
    "    Append cleaned (text, title) pairs from `records` to `before` / `after`.\n",
    "\n",
    "    A record is kept when `filter_news` rejects nothing in its text,\n",
    "    `keep_language` accepts its 'language' value, and both its text and its\n",
    "    title are non-empty. Checks short-circuit in that order.\n",
    "    \"\"\"\n",
    "    for record in records:\n",
    "        if (\n",
    "            not filter_news(record['text'])\n",
    "            and keep_language(record['language'])\n",
    "            and len(record['text'])\n",
    "            and len(record['title'])\n",
    "        ):\n",
    "            before.append(cleaning(record['text']))\n",
    "            after.append(cleaning(record['title']))\n",
    "\n",
    "news = glob('news/*.json')\n",
    "\n",
    "before, after = [], []\n",
    "for n in tqdm(news):\n",
    "    # Decode explicitly as UTF-8 instead of the platform's default encoding.\n",
    "    with open(n, encoding='utf-8') as fopen:\n",
    "        data = json.load(fopen)\n",
    "    # Crawled files: keep everything that is not tagged ENGLISH.\n",
    "    _collect_pairs(data, lambda language: language != 'ENGLISH', before, after)\n",
    "\n",
    "with open('news-30k.json', encoding='utf-8') as fopen:\n",
    "    data = json.load(fopen)\n",
    "\n",
    "# news-30k: keep only records tagged malay.\n",
    "_collect_pairs(data, lambda language: language == 'malay', before, after)\n",
    "\n",
    "len(before), len(after)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('KalbarOnline, Sekadau – Ikatan alumni bintara polisi angkatan ke-27 tahun 2005 gelombang II (ZLD) Polres Sekadau, melakukan aksi bakti sosial dengan membagikan paket sembako kepada warga kurang mampu dan panti asuhan. Sebanyak 25 paket sembako berisi beras, gula, tepung, minyak goreng dan mie instan diserahkan langsung secara door to door kepada warga yang sebelumnya telah didata, untuk menghindari kerumunan. Sasarannya adalah warga kurang mampu atau fakir miskin, lansia, warga terdampak pandemi covid-19 dan anak yatim piatu di panti asuhan Harapan Bunda dan Yayasan Filipi. Bripka Hisbullah Aji selaku koordinator aksi baksos mengatakan, ide pemberian bantuan sembako muncul dari rasa keprihatinan terhadap warga lantaran sulitnya ekonomi ditengah masa pandemi. Terlebih bagi warga yang belum menerima bantuan dari pemerintah. Anggota ZLD Polres Sekadau yang berjumlah 22 personel kemudian mengadakan iuran sukarela yang dikumpulkan sehingga aksi bakti sosial ini dapat terealisasi. Bahkan anggota ZLD yang sudah bertugas diluar Polres Sekadau pun turut andil menyisihkan sebagian rejekinya untuk berbagi dengan sesama. Hal yang mendasari adalah rasa cinta dengan Kota Sekadau, dengan masyarakatnya yang hidup rukun aman dan damai. Dimana angkatan ZLD pertama kali ditugaskan dari Polda Kalbar pada tahun 2006, dan ikut andil membangun Polres Sekadau yang saat itu masih merupakan Kabupaten baru. “Semboyan kami yaitu ‘kami memang tidak Sedarah, tapi lebih dari Saudara’. Kekompakan dan rasa kemanusiaan kami tetap terjalin meskipun jarak berjauhan,” kata Bripka Aji. “Sekadau adalah tempat dimana sejak awal kami bertugas dan hidup berdampingan dengan masyarakat. Sudah selayaknya kami berbagi, meskipun belum seberapa namun besar harapan kami bantuan diberikan bisa tepat sasaran dan meringankan beban warga,” pungkasnya. (Mus)',\n",
       " 'Angkatan ZLD Polres Sekadau Tergerak Berikan Bantuan Sembako Kepada Warga Kurang Mampu')"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check the first (text, title) pair.\n",
    "before[0], after[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write both aligned lists to disk as a single JSON object.\n",
    "paired = {'before': before, 'after': after}\n",
    "with open('news-data.json', 'w') as sink:\n",
    "    json.dump(paired, sink)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
